package com.oig.sys.base.app.spider;

import com.oig.common.util.JsonUtil;
import lombok.extern.slf4j.Slf4j;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

import java.util.List;

/**
 * WebMagic {@link PageProcessor} that scrapes an HS-code detail page.
 *
 * <p>All values are read from table cells located under the element with id
 * {@code code-info} inside the element with id {@code wrap}. Each extracted column is
 * stored on the {@link Page} as a {@code List<String>} result field under one of these keys:
 * <ul>
 *   <li>{@code mainList} - basic information (section 1, column 2)</li>
 *   <li>{@code taxList} - tax rate information (section 3, column 2)</li>
 *   <li>{@code jgList} - regulatory conditions (section 7, column 1)</li>
 *   <li>{@code jyList} - inspection and quarantine categories (section 9, column 1)</li>
 *   <li>{@code xdNameList} / {@code xdTaxList} - agreement tariff names / rates
 *       (section 11, columns 1 and 2)</li>
 *   <li>{@code rcepNameList} / {@code rcepTaxList} - RCEP tariff names / rates
 *       (section 13, columns 1 and 2)</li>
 * </ul>
 *
 * <p>NOTE(review): the section indices are positional ({@code div[n]}), so they depend on the
 * exact layout of the target page - confirm against the live page if extraction comes back empty.
 */
@Slf4j
public class GwHsCodePageProcessor implements PageProcessor {

    /** XPath of the container whose child {@code div}s hold the individual data tables. */
    private static final String CODE_INFO_XPATH = "//body/div[@id='wrap']/div/div[@id='code-info']";

    /**
     * Shared crawl configuration: browser-like request headers, UTF-8 charset and a sleep time of
     * {@code 3 * 1000} (presumably milliseconds between requests - confirm against WebMagic docs).
     */
    private static final Site SITE = Site.me()
            .setSleepTime(3 * 1000)
            .addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3")
            .addHeader("Accept-Language", "zh-CN,zh;q=0.9")
            .setCharset("UTF-8")
            .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36");

    /**
     * Returns the crawl configuration used for this processor.
     *
     * @return the shared {@link Site} instance
     */
    @Override
    public Site getSite() {
        return SITE;
    }

    /**
     * Extracts the table columns of interest from the page and stores each one as a result field.
     *
     * @param page the downloaded page to extract from; result fields are written to it
     */
    @Override
    public void process(Page page) {
        log.debug("process page:{}", page.getUrl());

        List<String> mainList = extractColumn(page, "基本信息：{}", 1, 2);
        List<String> taxList = extractColumn(page, "税率信息：{}", 3, 2);
        List<String> jgList = extractColumn(page, "监管条件：{}", 7, 1);
        List<String> jyList = extractColumn(page, "检验检疫类别：{}", 9, 1);

        // Agreement tariff: names are in column 1, rates in column 2 of the same table.
        List<String> xdNameList = extractColumn(page, "协定税率名：{}", 11, 1);
        List<String> xdTaxList = extractColumn(page, "协定税率：{}", 11, 2);
        // RCEP tariff: same two-column layout as the agreement tariff table.
        List<String> rcepNameList = extractColumn(page, "RCEP税率名：{}", 13, 1);
        List<String> rcepTaxList = extractColumn(page, "RCEP税率：{}", 13, 2);

        page.putField("mainList", mainList);
        page.putField("taxList", taxList);
        page.putField("jgList", jgList);
        page.putField("jyList", jyList);

        page.putField("xdNameList", xdNameList);     // agreement tariff names
        page.putField("xdTaxList", xdTaxList);       // agreement tariff rates
        page.putField("rcepNameList", rcepNameList); // RCEP tariff names
        page.putField("rcepTaxList", rcepTaxList);   // RCEP tariff rates
    }

    /**
     * Selects the text of one table column inside one section of the code-info container and
     * logs the result at DEBUG level.
     *
     * @param page         the page to query
     * @param logFormat    SLF4J format string with a single {@code {}} placeholder for the JSON dump
     * @param sectionIndex 1-based XPath position of the section {@code div} under the container
     * @param columnIndex  1-based XPath position of the {@code td} within each table row
     * @return the text of every matching cell, as returned by the XPath selection
     */
    private static List<String> extractColumn(Page page, String logFormat, int sectionIndex, int columnIndex) {
        String expression = CODE_INFO_XPATH
                + "/div[" + sectionIndex + "]/table/tbody/tr/td[" + columnIndex + "]/text()";
        List<String> cells = page.getHtml().xpath(expression).all();
        // Guard explicitly: the JSON serialization is an eagerly evaluated argument, so without
        // this check it would run on every page even when DEBUG logging is switched off.
        if (log.isDebugEnabled()) {
            log.debug(logFormat, JsonUtil.toJSONString(cells));
        }
        return cells;
    }
}
